1 Import packages and define useful functions¶

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

import plotly.express as px
import matplotlib.pyplot as plt
import yfinance as yf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.tree import DecisionTreeRegressor
from sklearn.multioutput import RegressorChain
from sklearn.ensemble import GradientBoostingRegressor
In [2]:
def convert_placeholder_text_to_num(text):
    '''
    Parsing helper script. In a lot of investing.com data e.g.6.1 M is used to indicate 6.1 million. This script converts the string 
    into float for machine learning to be carried out. 
    '''
    result = text
    try:
        if text[-1] == 'M':
            result = float(text[:-2]) * 10**6
        elif text[-1] == 'B':
            result = float(text[:-2]) * 10**9
        elif text[-1] == 'K':
            result = float(text[:-2]) * 10**3
        elif text[-1] == '-':
            result = np.nan
        elif text[-1] == 'nan':
            result = np.nan
        elif text[-1] == 'NA':
            result = np.nan
        elif text[-1] == 'x':
            result = float(text[:-1])
        elif text[-1] == '%':
            result = text.replace(",","")
            result = float(result[:-1])*0.01
    except Exception as e:
        print(e)
       
    return result
In [3]:
def get_historical_yfinance(ticker):
    '''
    From ticker string in dataframes above, download historical data (5Y), skip if it's not present
    '''
    
    try:
        ticker = f'{ticker:04}'+'.HK' # convert ticker to yfinace form
        asset = yf.Ticker(ticker)
        hist = asset.history(period="3Y")
    except error as e:
        print(e)
        print(str(ticker))
        print('\n')
        
    return hist

# old code to download historical price data (doesn't work)
#historical_price_data = {}

#for elem in test_merge.Ticker:
#    historical_price_data[elem] = get_historical_yfinance(elem)

2 Import datasets¶

2.1 Financial ratios¶

In [41]:
# import datasets

EV = pd.read_csv('data/ratios/EV.csv', index_col=0, parse_dates=True, infer_datetime_format=True, thousands=',')
fcf = pd.read_csv('data/ratios/fcf.csv', index_col=0, parse_dates=True, infer_datetime_format=True, thousands=',')
ebitda = pd.read_csv('data/ratios/ebitda.csv', index_col=0, parse_dates=True, infer_datetime_format=True, thousands=',')
revenue = pd.read_csv('data/ratios/revenue.csv', index_col=0, parse_dates=True, infer_datetime_format=True, thousands=',')
roe = pd.read_csv('data/ratios/roe.csv', index_col=0, parse_dates=True, infer_datetime_format=True, thousands=',')
gross_profit_margin = pd.read_csv('data/ratios/gross-profit-margin.csv', index_col=0, parse_dates=True, infer_datetime_format=True, thousands=',')
quick_ratio = pd.read_csv('data/ratios/quick-ratio.csv', index_col=0, parse_dates=True, infer_datetime_format=True, thousands=',')
debt_to_equity = pd.read_csv('data/ratios/debt-to-equity.csv', index_col=0, parse_dates=True, infer_datetime_format=True, thousands=',')

# remove duplicates
EV = EV.drop_duplicates()
fcf = fcf.drop_duplicates()
ebitda = ebitda.drop_duplicates()
revenue = revenue.drop_duplicates()
roe = roe.drop_duplicates()
gross_profit_margin = gross_profit_margin.drop_duplicates()
quick_ratio = quick_ratio.drop_duplicates()
debt_to_equity = debt_to_equity.drop_duplicates()

# set up for merging dataframes
EV = EV.reset_index()
fcf = fcf.reset_index()
ebitda = ebitda.reset_index()
revenue = revenue.reset_index()
roe = roe.reset_index()
gross_profit_margin = gross_profit_margin.reset_index()
quick_ratio = quick_ratio.reset_index()
debt_to_equity = debt_to_equity.reset_index()

EV = EV.astype(str)
fcf = fcf.astype(str)
ebitda = ebitda.astype(str)
revenue = revenue.astype(str)
roe = roe.astype(str)
gross_profit_margin = gross_profit_margin.astype(str)
quick_ratio = quick_ratio.astype(str)
debt_to_equity = debt_to_equity.astype(str)
In [5]:
# merge dataframes into one big one

dataframes = [fcf,ebitda,revenue,roe,gross_profit_margin,quick_ratio,debt_to_equity]
test_merge = EV

for elem in dataframes:
    test_merge = pd.merge(test_merge, elem, how = 'outer',
                      left_on=['Ticker','Name','Sector','Industry','Industry Group','Market Cap','Last Reported Fiscal Year'],
                      right_on=['Ticker','Name','Sector','Industry','Industry Group','Market Cap','Last Reported Fiscal Year'])
    
    
    
#test_merge = pd.merge(EV.reset_index(), fcf.reset_index(), how = 'inner',
                      #left_on=['Ticker','Name','Sector','Industry','Industry Group','Market Cap','Last Reported Fiscal Year'],
                      #right_on=['Ticker','Name','Sector','Industry','Industry Group','Market Cap','Last Reported Fiscal Year'])
In [6]:
# dataset for ML all sectors
# test_merge.dropna().to_csv('test.csv')

2.2 Download historical prices¶

In [7]:
# See comments in next cell
# ticker_string = ''
# 
# for elem in test_merge.Ticker:
#     ticker_string = ticker_string + f'{elem:04}'+'.HK '
# 
# ticker_string = ticker_string[:-1] # drop the last white space character
In [8]:
# original code to download all historical price data.
# DON'T RUN! Very slow, import the csv instead (next cell). 

#test_assets = yf.download(tickers = ticker_string,  # list of tickers
#            period = "5y",         # time period
#            interval = "1mo",       # trading interval
#            ignore_tz = True,      # ignore timezone when aligning data from different exchanges?
#            prepost = False)       # download pre/post market hours data?
In [9]:
#test_assets.to_pickle('data/price_data_complete.pickle')
test_assets = pd.read_pickle('data/price_data_complete.pickle')
#test_assets = test_assets.set_index('Date')
In [10]:
test_assets.head()
Out[10]:
Adj Close ... Volume
1000.HK 1003.HK 1005.HK 1010.HK 1020.HK 1023.HK 1029.HK 1030.HK 1033.HK 1034.HK ... 9906.HK 9908.HK 9910.HK 9913.HK 9920.HK 9930.HK 9936.HK 9950.HK 9980.HK 9986.HK
Date
2018-04-01 2.76 2.00 2.509840 2.06 0.270 1.224793 0.168 5.405748 1.16 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2018-05-01 3.25 1.97 2.641937 2.18 0.270 1.210384 0.148 5.060175 1.27 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2018-06-01 3.00 2.00 2.641937 2.25 0.260 1.210384 0.134 6.030394 1.05 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2018-07-01 2.40 2.09 2.457001 2.22 0.249 1.628255 0.137 5.987926 1.07 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2018-08-01 2.50 1.87 2.457001 2.12 0.218 1.621050 0.113 5.155562 1.01 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 5202 columns

In [11]:
asset_prices = test_assets['Adj Close'].dropna(axis=1)
In [12]:
ML_data = test_merge.dropna().applymap(convert_placeholder_text_to_num).dropna()
could not convert string to float: ''
could not convert string to float: ''
could not convert string to float: ''
could not convert string to float: ''
could not convert string to float: ''
could not convert string to float: ''
In [13]:
ML_data.head()
Out[13]:
Ticker Name Sector Industry Industry Group Market Cap Last Reported Fiscal Year EV EV FQ-1 EV FQ-2 ... Debt / Equity FQ-3 Debt / Equity FQ-4 Debt / Equity FQ-5 Debt / Equity FQ-6 Debt / Equity FQ-7 Debt / Equity FQ-8 Debt / Equity FQ-9 Debt / Equity FQ-10 Debt / Equity FQ-11 Debt / Equity FQ-12
0 199 ITC Properties Group Limited Real Estate Real Estate Management & Development Real Estate 1.221000e+08 2023 4.833800e+07 75623000.0 101000000.0 ... 0.515 0.899 0.899 0.773 0.773 0.836 0.836 0.839 0.839 0.847
1 2322 Hong Kong ChaoShang Group Limited Industrials Trading Companies & Distributors Capital Goods 3.832000e+08 2023 3.652000e+08 438800000.0 409700000.0 ... 0.015 0.007 0.007 0.021 0.021 0.028 0.028 0.037 0.037 0.044
5 179 Johnson Electric Holdings Limited Consumer Discretionary Auto Components Automobiles & Components 1.056000e+09 2023 1.265000e+09 1435000000.0 1384000000.0 ... 0.256 0.28 0.28 0.213 0.213 0.247 0.247 0.26 0.26 0.224
9 989 Hua Yin International Holdings Limited Real Estate Real Estate Management & Development Real Estate 3.854000e+08 2023 5.113000e+08 592700000.0 537900000.0 ... -4.184 -4.0 -4.0 -3.746 -3.746 -1.337 -1.337 -1.472 -1.472 1.822
10 105 Associated International Hotels Limited Real Estate Real Estate Management & Development Real Estate 4.297000e+08 2023 3.898000e+08 496900000.0 536100000.0 ... 0.021 0.02 0.02 0.019 0.019 0.017 0.017 0.016 0.016 0.015

5 rows × 111 columns

In [14]:
ratio_ticker_list = list(ML_data.Ticker)
ratio_ticker_list_new = []

for elem in ratio_ticker_list:
    ratio_ticker_list_new.append(elem+'.HK')
In [15]:
asset_ticker_list = list(asset_prices.columns)

combined_tickers = [value for value in asset_ticker_list if value in ratio_ticker_list_new]
In [16]:
combined_tickers
combined_ticker_int = []

for elem in combined_tickers:
    combined_ticker_int.append(elem[:-3])
In [17]:
ML_final = ML_data[ML_data['Ticker'].isin(combined_ticker_int)]
In [18]:
# gives 61 assets but I haven't dropped NaNs in the financial ratios dataset. Final filtered dataset is expected to contain fewer assets. Edit: 7. 
asset_prices[combined_tickers].head()
Out[18]:
1023.HK 1140.HK 1243.HK 1373.HK 1468.HK 1499.HK 2322.HK
Date
2018-04-01 1.224793 2.984653 0.852034 1.133839 0.239 0.60 0.60
2018-05-01 1.210384 3.051189 0.790193 1.233674 0.237 0.91 0.70
2018-06-01 1.210384 2.956137 0.755837 1.219412 0.227 0.85 0.67
2018-07-01 1.628255 2.861084 0.755837 1.390557 0.246 0.72 0.60
2018-08-01 1.621050 2.823063 0.748966 1.411951 0.355 0.67 0.58

3 PCA / LDA analysis¶

3.1 LDA¶

In [19]:
PCA_LDA_df = test_merge[['Ticker', 'Sector', 'Industry', 'Market Cap', 'EV',\
           'FCF', 'EBITDA', 'Revenue', 'ROE', 'Gross-Profit-Margin',\
            'Quick-Ratio', 'Debt / Equity']]

# remove NaNs, replace placeholder text with floats

PCA_LDA_df = PCA_LDA_df.dropna().applymap(convert_placeholder_text_to_num).dropna()

# set up datasets for LDA 

#X = dataset.iloc[:, 0:4].values
#y = dataset.iloc[:, 4].values

X = PCA_LDA_df[['Market Cap', 'EV', 'FCF', 'EBITDA', 'Revenue', 'ROE', 'Gross-Profit-Margin',\
            'Quick-Ratio', 'Debt / Equity']].values
y = PCA_LDA_df[['Sector']].values
y = y.flatten()

# scaler = StandardScaler()
# X_lda = scaler.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

lda = LDA(n_components=7)
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)
In [20]:
plt.plot(lda.explained_variance_ratio_)
Out[20]:
[<matplotlib.lines.Line2D at 0x7fe106ee5a20>]
In [ ]:
lda = LinearDiscriminantAnalysis(n_components=2)
X_r2 = lda.fit(X, y).transform(X)

colors = ["navy", "turquoise", "darkorange"]
lw = 2

for color, i, target_name in zip(colors, [0, 1, 2], target_names):
    plt.scatter(
        X_r2[y == i, 0], X_r2[y == i, 1], alpha=0.8, color=color, label=target_name
    )
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.title("LDA of IRIS dataset")
In [ ]:
X;

3.2 PCA¶

In [22]:
# taken from https://stackoverflow.com/questions/39216897/plot-pca-loadings-and-loading-in-biplot-in-sklearn-like-rs-autoplot

def myplot(score,coeff,labels=None):
    xs = score[:,0]
    ys = score[:,1]
    n = coeff.shape[0]
    scalex = 1.0/(xs.max() - xs.min())
    scaley = 1.0/(ys.max() - ys.min())
    plt.scatter(xs * scalex,ys * scaley)#, c = y)
    for i in range(n):
        plt.arrow(0, 0, coeff[i,0], coeff[i,1],color = 'r',alpha = 0.5)
        if labels is None:
            plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, "Var"+str(i+1), color = 'g', ha = 'center', va = 'center')
        else:
            plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, labels[i], color = 'g', ha = 'center', va = 'center')
    plt.xlim(-1,1)
    plt.ylim(-1,1)
    plt.xlabel("PC{}".format(1))
    plt.ylabel("PC{}".format(2))
    plt.grid()
In [23]:
scaler = StandardScaler()
scaler.fit(X)
X_pca=scaler.transform(X)    
pca = PCA()
X_pca = pca.fit_transform(X_pca)
In [24]:
myplot(X_pca[:,0:2],np.transpose(pca.components_[0:2, :]))
plt.show()
In [25]:
# Using plotly

components = pca.fit_transform(X_pca)
labels = {
    str(i): f"PC {i+1} ({var:.1f}%)"
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
}

fig = px.scatter_matrix(
    components,
    labels=labels,
    dimensions=range(5),
    color=y.flatten()
)
fig.update_traces(diagonal_visible=False)
fig.update_layout(
    autosize=False,
    width=1000,
    height=800,)
fig.show()
In [26]:
# Percentage of variance explained for each components
print("explained variance ratio: %s" % str(pca.explained_variance_ratio_))
explained variance ratio: [5.39614297e-01 2.21427804e-01 1.13341854e-01 9.61741960e-02
 1.36704757e-02 1.10874438e-02 3.20397805e-03 1.29777607e-03
 1.82176038e-04]
In [42]:
plt.plot(pca.explained_variance_ratio_)
plt.xlabel('n_component');
plt.ylabel('Explained variance ratio');
plt.savefig('pca_scree.png',dpi=600)

4 Basic machine learning model¶

4.1 Data pre-processing¶

Resample to quarterly intervals.

In [28]:
# test code

asset_prices = test_assets['Adj Close'].dropna(axis=1)

# resample monthly to quarterly intervals
asset_prices = asset_prices[['2322.HK','1023.HK','1468.HK','1243.HK','1373.HK','1140.HK','1499.HK']].resample('Q').last()

# drop the last quarter
asset_prices = asset_prices.iloc[:-1]
In [29]:
# test code cont.
# reverse the order of the df, calculate % change i.e. returns, then select datapoints corresponding to financial ratios. 
returns_rhs = asset_prices[::-1].pct_change(-1).iloc[:13].rename(columns={'2322.HK': '2322.HK ret', '1023.HK': '1023.HK ret',\
                                                                                             '1468.HK': '1468.HK ret','1243.HK': '1243.HK ret',\
                                                                                             '1373.HK': '1373.HK ret','1140.HK': '1140.HK ret',\
                                                                                             '1499.HK': '1499.HK ret'})
In [30]:
def get_train_test_from_fratio(fratio):
    '''
    Given a financial ratio (list below), return a dataframe [X1,X2,...,y1,y2,...] where Xi are the financial ratios and yi are the % returns for the asset.
    Vertical axis is time (in decreasing order)
    
    Financial ratios: 
    - EV
    - FCF
    - EBITDA
    - Revenue
    - ROE
    - Gross-Profit-Margin
    - Quick-Ratio
    - Debt / Equity
    '''
    
    EV_lhs = ML_final.loc[:,ML_final.columns.str.contains(fratio)].transpose().reset_index().rename(columns={1: '2322.HK', 13: '1023.HK',\
                                                                                             18: '1468.HK',20: '1243.HK',\
                                                                                             25: '1373.HK',57: '1140.HK',\
                                                                                             96: '1499.HK'})
    # reverse the order of the df, calculate % change i.e. returns, then select datapoints corresponding to financial ratios. 
    returns_rhs = asset_prices[::-1].pct_change(-1).iloc[:13].rename(columns={'2322.HK': '2322.HK ret', '1023.HK': '1023.HK ret',\
                                                                                                 '1468.HK': '1468.HK ret','1243.HK': '1243.HK ret',\
                                                                                                 '1373.HK': '1373.HK ret','1140.HK': '1140.HK ret',\
                                                                                                 '1499.HK': '1499.HK ret'})
    
    df = pd.concat([EV_lhs, returns_rhs.reset_index()],axis=1)[['Date','2322.HK','2322.HK','1023.HK','1468.HK','1243.HK','1373.HK',\
                                                       '1140.HK','1499.HK','2322.HK ret','1023.HK ret','1468.HK ret',\
                                                       '1243.HK ret','1373.HK ret','1140.HK ret','1499.HK ret']].set_index('Date')
    return df

Define ML data each frame refers to a particular financial ratio

In [39]:
X_EV = get_train_test_from_fratio('EV')
X_FCF = get_train_test_from_fratio('FCF')
X_EBIT = get_train_test_from_fratio('EBITDA')
X_REV = get_train_test_from_fratio('Revenue')
X_ROE = get_train_test_from_fratio('ROE')
X_GPM = get_train_test_from_fratio('Gross-Profit-Margin')
X_QUICK = get_train_test_from_fratio('Quick-Ratio')
X_DE = get_train_test_from_fratio('Debt / Equity')

X_EV.to_csv('data/X_EV_MLdata.csv')
X_FCF.to_csv('data/X_FCF_MLdata.csv')
X_EBIT.to_csv('data/X_EBIT_MLdata.csv')
X_REV.to_csv('data/X_REV_MLdata.csv')
X_ROE.to_csv('data/X_ROE_MLdata.csv')
X_GPM.to_csv('data/X_GPM_MLdata.csv')
X_QUICK.to_csv('data/X_QUICK_MLdata.csv')
X_DE.to_csv('data/X_DE_MLdata.csv')
In [34]:
# taken from https://towardsdatascience.com/the-complete-guide-to-time-series-forecasting-using-sklearn-pandas-and-numpy-7694c90e45c1

def ts_test_train_split(df):
    '''
    For dataframe of the format [X1,X2,...,y1,y2,...] generated from get_train_test_from_fratio, return training and test set of data
    '''
    X_cols = [col for col in df.columns if col.endswith('HK')]

    y_cols = [col for col in df.columns if col.endswith('ret')]
    X_train = df[X_cols][:-2].values
    y_train = df[y_cols][:-2].values

    X_test = df[X_cols][-2:].values
    y_test = df[y_cols][-2:].values
    
    return X_train, y_train, X_test, y_test
In [35]:
X_train, y_train, X_test, y_test = ts_test_train_split(X_EV)
In [36]:
y_test
Out[36]:
array([[ 0.65999997, -0.55288456, -0.20779217,  0.1025641 , -0.03719925,
        -0.25999999, -0.17543859],
       [ 0.19047623, -0.1476528 ,  0.56504057, -0.18771023,  0.01966014,
        -0.04458602,  0.4690721 ]])
In [37]:
def ML_decision_tree_boosting(X_train,y_train,X_test,y_test):
    '''
    Run a decision tree regressor and gradient boosting regressor and return coefficient of determination on both training / test sets.
    Gradient boosting regressor is chained so the model will try to predict sequential returns from past inputs.
    
    See: https://towardsdatascience.com/the-complete-guide-to-time-series-forecasting-using-sklearn-pandas-and-numpy-7694c90e45c1
    '''
    
    dt_seq = DecisionTreeRegressor(random_state=42)
    dt_seq.fit(X_train, y_train)
    dt_seq_preds = dt_seq.predict(X_test)
    
    gbr_seq = GradientBoostingRegressor(random_state=42)
    chained_gbr = RegressorChain(gbr_seq)
    chained_gbr.fit(X_train, y_train)
    gbr_seq_preds = chained_gbr.predict(X_test)
    
    dt_train_score = dt_seq.score(X_train,y_train)
    dt_test_score = dt_seq.score(X_test,y_test)
    cgbr_train_score = chained_gbr.score(X_train,y_train)
    cgbr_test_score = chained_gbr.score(X_test,y_test)
    
    return dt_train_score, dt_test_score, cgbr_train_score, cgbr_test_score
In [40]:
# apply ML function to all financial ratio dataframes
financial_ratios_dict = {'X_EV': X_EV,
                        'X_FCF': X_FCF,
                        'X_EBIT': X_EBIT,
                        'X_REV': X_REV,
                        'X_ROE': X_ROE,
                        'X_GPM': X_GPM,
                        'X_QUICK': X_QUICK,
                        'X_DE': X_DE}

for elem in financial_ratios_dict.keys():
    X_train, y_train, X_test, y_test = ts_test_train_split(financial_ratios_dict[elem])
    dt_train_score, dt_test_score, cgbr_train_score, cgbr_test_score = ML_decision_tree_boosting(X_train,y_train,X_test,y_test)
    print(str(elem) + ' dt train score: ' +str(dt_train_score) + ', dt test score: ' + str(dt_test_score)+\
          ', cgbr train score: '+ str(cgbr_train_score) + ', cgbr test score: '+str(cgbr_test_score))
X_EV dt train score: 1.0, dt test score: -2.581986121565467, cgbr train score: 0.9999999956582766, cgbr test score: -5.163514426569127
X_FCF dt train score: 1.0, dt test score: -5.058310243181415, cgbr train score: 0.9999999947948892, cgbr test score: -7.744285351237203
X_EBIT dt train score: 1.0, dt test score: -8.477140958532724, cgbr train score: 0.9999998966063013, cgbr test score: -9.077284584457173
X_REV dt train score: 1.0, dt test score: -6.648629112063416, cgbr train score: 0.999999993985728, cgbr test score: -5.569041106992259
X_ROE dt train score: 1.0, dt test score: -6.648629112063416, cgbr train score: 0.9999999968451213, cgbr test score: -4.433750342075148
X_GPM dt train score: 0.6504882948750798, dt test score: -8.31246456204073, cgbr train score: 0.42253245611372925, cgbr test score: -4.963616181777584
X_QUICK dt train score: 0.6112198297949563, dt test score: -6.648629112063416, cgbr train score: 0.2922739158006787, cgbr test score: -5.162086510747484
X_DE dt train score: 0.6112198297949563, dt test score: -6.648629112063416, cgbr train score: 0.29300018255057064, cgbr test score: -9.32353157453062